import scrapy
from ..items import HeartItem


class CADSpider(scrapy.Spider):
    """Crawl the DXY heart channel's CAD tag listing and scrape each article.

    Starts at page 1 of the tag listing, follows every article link on the
    page, then follows the "next page" pagination link until none remains.
    Yields one ``HeartItem`` per article page.
    """

    name = "cad_spider"
    start_urls = ["http://heart.dxy.cn/tag/CAD/p-1"]

    def parse(self, response):
        """Parse a tag-listing page: schedule article requests, then pagination.

        Yields ``scrapy.Request`` objects for each article (handled by
        :meth:`parse_article`) and, if present, one request for the next
        listing page (handled by this method again).
        """
        article_links = response.xpath(
            "//p[contains(@class,'title')]/a[contains(@class,'h4')]/@href"
        ).extract()
        for article_link in article_links:
            # urljoin is a no-op for absolute hrefs but makes relative ones
            # resolvable against the current page URL.
            yield scrapy.Request(
                response.urljoin(article_link), callback=self.parse_article
            )

        # Pagination: follow the "next page" link if one exists.
        next_links = response.xpath(
            "//div[contains(@class,'el_page x_page1')]//a[contains(@title,'下一页')]/@href"
        ).extract()
        if next_links:
            yield scrapy.Request(response.urljoin(next_links[0]), callback=self.parse)

    def parse_article(self, response):
        """Extract one article page into a ``HeartItem`` and yield it.

        Uses ``extract_first(default=...)`` and bounds-checked indexing so a
        page with a missing field yields an item with empty strings for the
        absent fields instead of raising ``IndexError``.  (The previous
        ``try/finally: yield`` yielded a half-built item and then re-raised
        the exception anyway, aborting the crawl.)
        """
        item = HeartItem()
        item["url"] = response.url
        item["article_id"] = response.url.split("/")[-1]
        item["tag"] = response.xpath(
            "string(//a[contains(@class,'channel_name')])"
        ).extract_first(default="")
        item["title"] = response.xpath("//h1/text()").extract_first(default="")
        item["source"] = response.xpath(
            "string(//div[contains(@class,'sum')]/span[2])"
        ).extract_first(default="")
        # Evaluate the summary-line spans once; the original read index 0 as
        # the date and index 2 as the author — presumably the layout puts the
        # source text between them (TODO confirm against a live page).
        summary_texts = response.xpath(
            "//div[contains(@class,'sum')]/span/text()"
        ).extract()
        item["date"] = summary_texts[0].strip() if len(summary_texts) > 0 else ""
        item["author"] = summary_texts[2].strip() if len(summary_texts) > 2 else ""
        # List of text fragments from every paragraph of the article body.
        item["content"] = response.xpath(
            "//div[@id='content']/p/descendant::text()"
        ).extract()
        yield item
